salary = 1999 salary + bonuses in 1000 US dollar
totcomp = 1999 CEO total compensation
tenure = # of years as CEO (=0 if less than 6 months)
age = age of CEO
sales = total 1998 sales revenue of firm i
profits = 1998 profits for firm i
assets = total assets of firm i in 1998
from IPython.display import HTML, display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the CEO dataset; the trailing 'Unnamed: 7' column is an artifact of a
# trailing comma in the CSV header, so drop it right away.
df = pd.read_csv('ceo.csv').drop(columns=['Unnamed: 7'])
df.head(5)
df.describe()
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# One histogram per variable on a 4x2 grid of subplots.
fig = make_subplots(rows=4, cols=2, subplot_titles=(df.columns))
for idx, column_name in enumerate(df.columns):
    # Fill the grid left-to-right, two plots per row; add_trace replaces the
    # deprecated append_trace and the manually unrolled calls.
    fig.add_trace(go.Histogram(x=df[column_name].values, name=column_name),
                  row=idx // 2 + 1, col=idx % 2 + 1)
fig.update_layout(
    title_text='Distributions ', height=1000)
fig.show()
(A) For the variable totcomp compute the common location measures: mean, 5%- trimmed mean, median, upper and lower quartiles, the upper and lower 5%- quantiles. Give an economic interpretation for every location measure.
from scipy.stats import trim_mean

# Common location measures of totcomp as (label, value) rows.
location_measure_result = [
    ['Mean', round(df['totcomp'].mean(), 3)],
    ['5%- trimmed mean', round(trim_mean(df['totcomp'].values, 0.05), 3)],
    ['Median', df['totcomp'].median()],
    ['Quantile 25%', df['totcomp'].quantile(0.25)],
    ['Quantile 75%', df['totcomp'].quantile(0.75)],
    ['Quantile 5%', df['totcomp'].quantile(0.05)],
    ['Quantile 95%', round(df['totcomp'].quantile(0.95), 3)],
    ['Min', round(df['totcomp'].min(), 3)],
    ['Max', round(df['totcomp'].max(), 3)],
]

# Render the rows as a compact HTML table.
rows_html = ''.join(
    '<tr>' + ''.join('<td>' + str(value) + '</td>' for value in row) + '</tr>'
    for row in location_measure_result
)
html_text = ("<table style='font-size: 11pt; width: 300px;'>"
             "<tr><th>Location measure</th><th>Value</th></tr>"
             + rows_html + "</table>")
display(HTML(html_text))
#pd.DataFrame(location_measure_result, columns=['Measure', 'value'])
The values of total compensation for CEOs range from $100$ to $589101$ (min and max values).
Mean value is $8340.058$, median value is $2951.0$, which is far from mean value.
The reason is that our sample contains a few large outliers (max value = $589101$, Quantile $95\%$ = $24563.3$),
so the mean is pulled upwards. If we trim $5\%$ from both sides, we get the $5\%$-trimmed mean of $4637.68$, which is closer to the median.
The most values are between $1575.5$ and $6043.0$ ($25\%$ and $75\%$ quantile)
(b) Plot the empirical cumulative distribution function. Compute and explain in economic terms the following quantities
$i)~\hat{F}^{-1}(0.1)$ and $\hat{F}^{-1}(0.9)$
$ii)~\hat{F}(2000)$ and $1-\hat{F}(4000)$
data = df['totcomp'].values  # raw totcomp observations as a NumPy array, reused by the ECDF below
def ecdf(x):
    """Build the empirical CDF of the sample *x*.

    Returns a callable F with F(v) = (# observations <= v) / n, evaluated by
    binary search on the sorted sample; accepts scalars and arrays alike.
    """
    sorted_sample = np.sort(x)
    sample_size = sorted_sample.size

    def evaluate(v):
        # side='right' counts observations equal to v as "<= v".
        return np.searchsorted(sorted_sample, v, side='right') / sample_size

    return evaluate
# Empirical CDF of totcomp drawn as a step function over the observed values.
fig = go.Figure()
fig.add_scatter(x=np.unique(data), y=ecdf(data)(np.unique(data)), line_shape='hv')
fig.update_layout(title_text="<span style='font-size: 18pt;'>The empirical cumulative distribution function</b></span>")
fig.show()

# Quantiles and CDF evaluations with their economic reading.  Raw strings keep
# the LaTeX backslashes (\hat) from being treated as escape sequences.
html_text = r'$\hat{F}^{-1}(0.1)=' + str(round(df['totcomp'].quantile(0.1), 5)) + '$ - this value is higher than 10% of observed values<br>'
html_text += r'$\hat{F}^{-1}(0.9)=' + str(round(df['totcomp'].quantile(0.9), 5)) + '$ - this value is higher than 90% of observed values<br>'
html_text += r'$\hat{F}(2000)=' + str(round(ecdf(data)(2000), 5)) + '$ - relative number of observations equal to or less than 2000<br>'
# BUG FIX: the original evaluated the ECDF at 2000 here even though the text
# reports 1 - F(4000); evaluate at 4000 as intended.
html_text += r'$1-\hat{F}(4000)=' + str(round(1 - ecdf(data)(4000), 5)) + '$ - relative number of observations higher than 4000<br>'
display(HTML(html_text))
(c) Plot the histogram of totcomp and the Box-plot (or violin-plot). What can be concluded about the distribution of the data? Are the location measures computed above still appropriate? Compute and discuss an appropriate measure of symmetry.
import plotly.graph_objects as go

# Histogram of the raw totcomp values.
fig = go.Figure()
fig.add_trace(go.Histogram(x=df['totcomp'].values))
fig.update_layout(title_text="<span style='font-size: 18pt;'>The histogram of <b>totcomp</b></span>")
fig.show()

import plotly.express as px

# Violin plot with an embedded box plot; every observation drawn as a point.
fig = px.violin(df, y='totcomp', box=True, points='all')
fig.update_layout(title_text="<span style='font-size: 18pt;'>The violin plot of <b>totcomp</b></span>")
fig.show()

from scipy.stats import skew

# Sample skewness as a numeric measure of (a)symmetry.
print('Skew value: ', skew(df['totcomp'].values))
As we can see - our distribution is very skewed. We have one very large outlier.
Skewness value $14.79648 > 0$, which means that there is more weight in the right tail of the distribution.
(d) Check which method is used in your software to compute the optimal bandwidth (or the number of bars) in the histogram. Describe it shortly here. Make plots of too detailed and too rough histograms. What can we learn from these figures?
import plotly.express as px

# Two extremes of the bin choice: 20 bins is too coarse, 200 too fine.
for nbins, flavour in ((20, 'rough'), (200, 'detailed')):
    fig = px.histogram(df, x='totcomp', nbins=nbins)
    fig.update_layout(
        title_text="<span style='font-size: 18pt;'>The " + flavour
                   + " histogram of <b>totcomp</b></span>",
        height=500)
    fig.show()
I have not found documentation of the bandwidth-optimization method used by plotly. Presumably it could be Shimazaki and Shinomoto's choice.
The choice is based on minimization of an estimated $L_2$ risk function:
$$\arg\min_h \frac{2\bar{m}-v}{h^2}$$
where $\bar{m}$ and $v$ are the mean and the biased variance of the bin counts of a histogram with bin width $h$
$$\bar{m}=\frac{1}{k} \sum_{i=1}^{k} m_i$$$$v= \frac{1}{k} \sum_{i=1}^{k} (m_i - \bar{m})^2 $$(e) There are methods which help us make the distribution of the sample more symmetric. Consider the natural logarithm of the total compensation: ln(totcomp). Plot the histogram (and Box-plot) and compare it with the figures for the original data. Compute the mean and the median. What can be concluded from the new values? Provide economic interpretation.
# Log-transform totcomp to tame the heavy right tail.
log_values = np.log(df['totcomp'].values)
log_df = pd.DataFrame(log_values, columns=['totcomp'])
mean_log = np.mean(log_values)
median_log = np.median(log_values)
print('After the natural logarithm compensation')
print('Mean : ', mean_log)
print('Median : ', median_log)

# Histogram with vertical mean/median markers (y up to 42 to span the bars).
fig = go.Figure()
fig.add_trace(go.Histogram(x=log_values, name='Histogram'))
for marker_x, label in ((mean_log, 'Mean'), (median_log, 'Median')):
    fig.add_trace(go.Scatter(x=[marker_x, marker_x], y=[0, 42], name=label))
fig.update_layout(title_text="<span style='font-size: 18pt;'>The histogram after the natural logarithm compensation for <b>totcomp</b></span>", height=500)
fig.show()

# Violin + box plot of the log-transformed values.
fig = px.violin(log_df, y='totcomp', box=True, points='all')
fig.update_layout(title_text="<span style='font-size: 18pt;'>The violin plot after the natural logarithm compensation for <b>totcomp</b></span>", height=600)
fig.show()
As we can see, the log function essentially de-emphasizes very large values.
After this transformation, the mean and the median become almost equal, which also indicates that the sample has become more symmetric and centered.
2. (a) We suspect that the total compensation of the CEO and other variables are related. Compute the correlation coefficients of Pearson and plot them as a heatmap (correlation map). Discuss the strength of the correlations.
import itertools
def plot_heat_matrix(cm, classes,
                     title='Pearson correlation',
                     cmap=plt.cm.YlGnBu,
                     bar_title="Pearson correlation",
                     xlabel='',
                     ylabel='', show_numbers=False):
    """Plot a square numeric matrix *cm* as an annotated heatmap.

    Parameters
    ----------
    cm : 2-D array of numbers (e.g. a correlation matrix).
    classes : sequence of tick labels, one per row/column of *cm*.
    title, bar_title : figure title and colorbar title.
    cmap : matplotlib colormap.
    xlabel, ylabel : currently unused; kept for interface compatibility.
    show_numbers : if True, print each cell's (rounded) value inside it.
    """
    plt.figure(figsize=(12, 12))
    plt.imshow(cm, cmap=cmap)
    plt.title(title, fontsize=20)
    cb = plt.colorbar()
    cb.ax.set_title(bar_title, fontsize=18)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation='vertical', fontsize=18)
    plt.yticks(tick_marks, classes, fontsize=18)
    # Derive the y-limit from the matrix size instead of the original
    # hard-coded 6.5 (which assumed exactly 7 variables), so matrices of any
    # size render fully.
    plt.ylim((-0.5, len(classes) - 0.5))
    if show_numbers:
        # White text on dark cells, black on light ones, for readability.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, round(cm[i, j], 4),
                     horizontalalignment="center", fontsize=14,
                     color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
# Pairwise Pearson correlations of all variables, shown as a heatmap.
display(HTML('<h2>Pearson correlation</h2>'))
df_cor_pearson = df.corr(method='pearson')
df_cor_pearson
heatmap_kwargs = dict(title='Pearson correlation',
                      cmap=plt.cm.YlGnBu,
                      bar_title='Pearson correlation',
                      xlabel='', ylabel='',
                      show_numbers=True)
plot_heat_matrix(df_cor_pearson.values, df_cor_pearson.columns, **heatmap_kwargs)
The correlation between total compensation and the other variables is comparatively small.
Only salary shows a slight correlation with it.
Among the other variables, sales and profits are strongly correlated with each other, which makes sense.
(b) Plot the scatter plots (pairs in R). Conclude if the linear correlation coefficients are appropriate here. Compute now the Spearman’s correlations and make a heatmap. Compare the results with Pearson. What is the rank of the observation totcomp= 6000?
In the Spearman correlation we use ranks. To compute Rank[$totcomp= 6000$], we sort the values of our sample and count how many of them are less than 6000. The result:
# Rank of totcomp = 6000: number of observations strictly below that value.
print('Rank[totcomp= 6000] = ', int((df['totcomp'] < 6000).sum()))
import seaborn as sns

sns.set(style="ticks", color_codes=True)
g = sns.pairplot(df)
Since we have large outliers, our scatter plots are flattened; outliers have a big influence on the Pearson correlation.
# Pairwise Spearman (rank) correlations, visualised like the Pearson ones.
display(HTML('<h2>Spearman correlation</h2>'))
df_cor_spearman = df.corr(method='spearman')
df_cor_spearman
spearman_kwargs = dict(title='Spearman correlation',
                       cmap=plt.cm.YlGnBu,
                       bar_title='Spearman correlation',
                       xlabel='', ylabel='',
                       show_numbers=True)
plot_heat_matrix(df_cor_spearman.values, df_cor_spearman.columns, **spearman_kwargs)
The Spearman correlation is less sensitive to outliers; that is why we can see the real correlation between totcomp and salary, which we could not notice in the Pearson correlation.
Also we can see moderate correlation between totcomp and sales, profits and assets, which also makes sense.
(c) Consider the two subsamples: CEOs younger than 50 and older than 50. Plot for both subsamples overlapping histograms/ecdf’s and discuss the results. What can we learn from the corresponding location and dispersion (!) measures?
# Split totcomp into two age groups and compare their distributions.
young = df[df['age'] <= 50]['totcomp'].values
old = df[df['age'] > 50]['totcomp'].values

# Overlaid histograms of the two groups.
fig = go.Figure()
fig.add_trace(go.Histogram(x=old, name='age > 50'))
fig.add_trace(go.Histogram(x=young, name='age <= 50'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.update_layout(title_text="<span style='font-size: 18pt;'>The overlapping histograms for subsamples</span>", height=500)
fig.show()

# Overlaid empirical CDFs of the two groups.
fig = go.Figure()
for sample, label in ((old, 'age > 50'), (young, 'age <= 50')):
    grid = np.unique(sample)
    fig.add_scatter(x=grid, y=ecdf(sample)(grid), line_shape='hv', name=label)
fig.update_layout(title_text="<span style='font-size: 18pt;'>The empirical cumulative distribution function</b></span>")
fig.show()

# Location and dispersion measures per group, formatted to 3 decimals.
ages_values = [[young.mean(), old.mean()],
               [np.median(young), np.median(old)],
               [young.var(), old.var()]]
ages_df = pd.DataFrame(ages_values, columns=['age <= 50', 'age > 50'],
                       index=['Mean', 'Median', 'Variance'])
for col in ('age <= 50', 'age > 50'):
    ages_df[col] = ages_df[col].apply(lambda x: '%.3f' % x)
ages_df
young.var()
old.var()
# Share of the combined variance contributed by the younger group.
young.var() / (old.var() + young.var())
For age > 50 the variance is significantly larger than for age <= 50.
The median and mean values are also larger for CEOs older than 50.
Consider another grouping of the data. Define the groups:
(a) Aggregate the data to a 2 × 3 contigency table with absolute and with relative frequencies.
# 2x3 contingency cells: age groups (A1: <=50, A2: >50) crossed with salary
# bands (S1: <3000, S2: 3000-5000, S3: >=5000).
A1_S1 = df.query('age <= 50 & salary < 3000')['totcomp'].values
A1_S2 = df.query('age <= 50 & salary >= 3000 & salary < 5000')['totcomp'].values
A1_S3 = df.query('age <= 50 & salary >= 5000')['totcomp'].values
A2_S1 = df.query('age > 50 & salary < 3000')['totcomp'].values
A2_S2 = df.query('age > 50 & salary >= 3000 & salary < 5000')['totcomp'].values
A2_S3 = df.query('age > 50 & salary >= 5000')['totcomp'].values
table = [[A1_S1, A1_S2, A1_S3], [A2_S1, A2_S2, A2_S3]]

# Absolute frequencies: per-row cell counts followed by the row total...
abs_table = []
for row in table:
    counts = [len(values) for values in row]
    abs_table.append(counts + [sum(counts)])
# ...plus a final row of column totals (the last entry is the grand total).
abs_table.append([sum(rows[i] for rows in abs_table) for i in range(4)])

display(HTML("<h3>Absolute frequencies</h3>"))
pd.DataFrame(abs_table, columns=['S1','S2', 'S3', 'Total'], index=['A1','A2','Total'])
display(HTML("<h3>Relative frequencies</h3>"))
pd.DataFrame(np.array(abs_table) / len(df), columns=['S1','S2', 'S3', 'Total'], index=['A1','A2','Total'])
(b) Give interpretation for the values of $n_{12}, h_{12}, n_1$ and $h_1$.
$n_{12} = 75$ - the number of CEOs older than 50 years that have a salary smaller than $3000$
$h_{12} = 0.167$ - the share (relative to all observations) of CEOs older than 50 years that have a salary smaller than $3000$
$n_1 = 382$ - the number of CEOs that have a salary smaller than $3000$
$h_1 = 0.854$ - the share of CEOs that have a salary smaller than $3000$
(c) Compute an appropriate dependence measure for $S_i$ and $A_j$. What can be con-cluded from its value?
from scipy.stats import chi2_contingency

# Chi-squared test of independence on the 2x3 absolute-frequency table
# (margins sliced off; no Yates continuity correction).
observed = np.array(abs_table)[:2, :3]
chi2, p, dof, ex = chi2_contingency(observed, correction=False)
print('chi2: ', chi2)
print('p: ', p)
1. Simulate (with a fixed seed) a sample of size $n = 100$ from the normal distribution with $μ_1 =10$ and $σ_1^2 =9$.
(a) Plot the histogram and compare it to the density of $N(10,9)$.
np.random.seed(42)
# BUG FIX: the task asks for N(10, 9), i.e. variance sigma^2 = 9, so the
# standard deviation passed to np.random.normal must be 3.  The original code
# passed 9 (simulating N(10, 81)), which also broke the later comparison with
# the t_5 sample that is rescaled to variance 9.
mu, sigma = 10, 3  # mean and standard deviation
normal_distribution = np.random.normal(mu, sigma, 100)
fig = go.Figure(data=[go.Histogram(x=normal_distribution, nbinsx=20)])
fig.update_layout(title_text="<span style='font-size: 18pt;'>The histogram for the normal distribution</span>", height=500)
fig.show()
(b) Now draw a sample yi of size n = 100 from t5. Transform it as follows: $10 + 3\sqrt{3/5} · y_i$ . Plot the histogram and compare the density of $N (10, 9)$. What can be concluded and why this example might be relevant for empirical studies?
import scipy.stats as sts

# Draw t_5 variates and rescale to mean 10 (Var(t_5) = 5/3, and the factor
# 3*sqrt(3/5) squared times 5/3 gives variance 9).
student_distribution = sts.t.rvs(df=5, size=100)
modified_distribution = student_distribution * (3 * np.sqrt(3 / 5)) + 10

fig = go.Figure(data=[go.Histogram(x=modified_distribution, nbinsx=20)])
fig.update_layout(title_text="<span style='font-size: 18pt;'>The histogram for the modified Student distribution</span>", height=500)
fig.show()

# Overlay both samples (opacity lowered so each stays visible) to compare tails.
fig = go.Figure()
fig.add_trace(go.Histogram(x=normal_distribution, nbinsx=20, name='Normal distribution'))
fig.add_trace(go.Histogram(x=modified_distribution, nbinsx=20, name='Modified student distribution'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.75)
fig.update_layout(title_text="<span style='font-size: 18pt;'>Comparing histograms for the normal and modified Student distributions</span>", height=500)
fig.show()
2. In practice the data is always very heterogeneous. To reflect it we contaminate the data by adding an outlier or a subsample with different characteristics.
(a) To obtain a realistic heterogeneous sample add to the original normal data a new sample of size m simulated from $N(20,22)$, i.e. $μ_2 = 20$ and $σ_2 = 4$. The size m will obviously influence the above measures. Vary $m$ from $10$ to $200$. (The resulting sample is said to stem from a mixture normal distribution).
# Contamination subsamples from N(20, sigma=4), one sample per size m.
mu, sigma = 20, 4  # mean and standard deviation
m_values = [10, 50, 100, 150, 200]
# Comprehension instead of the original append loop (same contents).
add_distributions = [np.random.normal(mu, sigma, m) for m in m_values]
(b) Plot Box-plots (or violin plots) and histograms for each subsample individually and for the sample for a few different values of $m$.
from IPython.display import HTML

# Violin plots of each contamination subsample, one violin per m.
fig = go.Figure()
for m, sample in zip(m_values, add_distributions):
    fig.add_trace(go.Violin(x=[m] * len(sample),
                            y=sample,
                            name='m=' + str(m),
                            box_visible=True,
                            meanline_visible=True))
fig.update_layout(title_text="<span style='font-size: 18pt;'>Violin plots for each subsample of size m</span>")
fig.show()

# Histogram (left column) and violin (right column) per subsample, one grid
# row per m.  A loop with add_trace(row=, col=) replaces the ten manually
# unrolled, deprecated append_trace calls.
fig = make_subplots(rows=5, cols=2)
for i, (m, sample) in enumerate(zip(m_values, add_distributions)):
    name = 'm=' + str(m)
    fig.add_trace(go.Histogram(x=sample, nbinsx=20, name=name), row=i + 1, col=1)
    fig.add_trace(go.Violin(x=sample, box_visible=True, meanline_visible=True, name=name),
                  row=i + 1, col=2)
# (typo "differnt" in the displayed title fixed)
fig.update_layout(title_text="<span style='font-size: 18pt;'>Distributions for samples with different size m</span>", height=1600)
fig.show()
import copy  # kept for any later cells; no longer needed in this one

# Mixture samples: the original normal sample with each contamination
# subsample appended (sizes n + m).  Plain list concatenation already copies
# the data, so the original copy.deepcopy was unnecessary.
distributions_with_adds = [list(normal_distribution) + list(extra)
                           for extra in add_distributions]

# One violin per mixture sample.
fig = go.Figure()
for m, sample in zip(m_values, distributions_with_adds):
    fig.add_trace(go.Violin(y=sample,
                            name='m=' + str(m),
                            box_visible=True,
                            meanline_visible=True))
fig.update_layout(title_text="<span style='font-size: 18pt;'>Violin plots for modified distributions</span>")
fig.show()

# Histogram + violin grid for the mixture samples, one row per m; loop with
# add_trace(row=, col=) replaces the ten manually unrolled append_trace calls.
fig = make_subplots(rows=5, cols=2)
for i, (m, sample) in enumerate(zip(m_values, distributions_with_adds)):
    name = 'm=' + str(m)
    fig.add_trace(go.Histogram(x=sample, nbinsx=20, name=name), row=i + 1, col=1)
    fig.add_trace(go.Violin(x=sample, box_visible=True, meanline_visible=True, name=name),
                  row=i + 1, col=2)
# (typo "differnt" in the displayed title fixed)
fig.update_layout(title_text="<span style='font-size: 18pt;'>Modified distribution by samples with different size m</span>", height=1600)
fig.show()
(c) Make animated or interactive graphics (with manipulate, plotly, ggplot,etc.) to visualize the impact of $m$ on the histogram and location measures (added as vertical lines in the graph) of the data.
# Interactive figure: a slider steps through m, showing the corresponding
# mixture histogram together with a vertical line at its mean.
medians = []
means = []
heights = [14, 16, 25, 35, 42]  # per-m line heights matching the tallest bin
for sample in distributions_with_adds:
    medians.append(np.median(np.array(sample)))
    means.append(np.mean(np.array(sample)))

fig = go.Figure()
for i in range(5):
    fig.add_trace(go.Histogram(x=distributions_with_adds[i], nbinsx=25,
                               name='Histogram for m=' + str(m_values[i])))
    fig.add_trace(go.Scatter(x=[means[i], means[i]], y=[0, heights[i]],
                             name='Mean for m=' + str(m_values[i])))
fig.data[0].visible = True

# One slider step per m: hide every trace, then re-show the (histogram, mean
# line) pair belonging to that m.
steps = []
for i in range(int(len(fig.data) / 2)):
    visibility = [False] * len(fig.data)
    visibility[i * 2] = True
    visibility[i * 2 + 1] = True
    steps.append(dict(method="restyle",
                      args=["visible", visibility],
                      label='m=' + str(m_values[i])))

sliders = [dict(
    active=1,
    currentvalue={"prefix": "m: "},
    pad={"t": 50},
    steps=steps,
)]
fig.update_layout(
    sliders=sliders
)
fig.update_layout(title_text="<span style='font-size: 18pt;'>Modified distribution by samples with differnt size m</span>")
fig.show()
3. Next step is to simulate two dependent data sets. We simulate two samples with a given value of the correlation coefficient.
(a) Let $U \sim N(0,1)$ and $V \sim N(0,1)$. Let $U^* = U$ and $V^* = \rho U + \sqrt{1-\rho^2}\,V$. Prove that $Corr(U^*, V^*) = \rho$ and that the variances of both variables $U^*$ and $V^*$ equal one.
(b) Use the above idea to simulate two samples of size $n = 100$ from a normal distribution with different values of $ρ$. Compute the correlation coefficients of Pearson and of Spearman. Compare the correlation to the original parameter $ρ$ (for example, plot Pearson vs. $ρ$ and Spearman vs. $ρ$)
import scipy.stats as sts

np.random.seed(4)
mu, sigma = 0, 1  # mean and standard deviation
U = np.random.normal(mu, sigma, 100)
V = np.random.normal(mu, sigma, 100)

# Target correlations 0.0, 0.1, ..., 1.0.
# BUG FIX: np.linspace replaces np.arange(0, 1.1, 0.1), whose floating-point
# step overshoots so that the last value is slightly greater than 1; then
# sqrt(1 - p^2) is NaN and the last Pearson/Spearman values were NaN.
p_values = np.linspace(0, 1, 11)

# V* = p*U + sqrt(1 - p^2)*V has unit variance and Corr(U, V*) = p.
modified_V = [p * U + np.sqrt(1 - p * p) * V for p in p_values]

# Sample Pearson and Spearman correlations for each target p.
pearson_values = [np.corrcoef(U, V_star)[0, 1] for V_star in modified_V]
spearman_values = [sts.spearmanr(U, V_star)[0] for V_star in modified_V]
# Side-by-side comparison: sample Pearson (left) and Spearman (right) vs p.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Pearson / p", "Spearman / p"))
fig.append_trace(go.Scatter(x=p_values, y=pearson_values, name="Pearson / p"), 1, 1)
fig.append_trace(go.Scatter(x=p_values, y=spearman_values, name="Spearman / p"), 1, 2)
for panel in (1, 2):
    fig.update_xaxes(title_text="P", row=1, col=panel)
fig.update_yaxes(title_text="Pearson", row=1, col=1)
fig.update_yaxes(title_text="Spearman", row=1, col=2)
fig.update_layout(title_text="<span style='font-size: 18pt;'>Comparing correlations</span>")
fig.show()
(b) Make a nonlinear but monotone transformation of $V^∗$, say $exp$ for simplicity. Check the impact of this transformation on the correlation coefficients of Spearman and Pearson. Think about an appropriate visualization of the findings.
# exp() is monotone, so the ranks (and hence Spearman) are unchanged, while
# the nonlinearity distorts the Pearson coefficient.
modified_V_exp = np.exp(modified_V)
pearson_values_exp = [np.corrcoef(U, row)[0, 1] for row in modified_V_exp]
spearman_values_exp = [sts.spearmanr(U, row)[0] for row in modified_V_exp]
# Two-panel comparison of the correlations after the exp transform.
fig = make_subplots(rows=1, cols=2, subplot_titles=("Pearson / p", "Spearman / p"))
fig.add_trace(go.Scatter(x=p_values, y=pearson_values_exp, name="Pearson / p"), row=1, col=1)
fig.add_trace(go.Scatter(x=p_values, y=spearman_values_exp, name="Spearman / p"), row=1, col=2)
fig.update_xaxes(title_text="P", row=1, col=1)
fig.update_xaxes(title_text="P", row=1, col=2)
fig.update_yaxes(title_text="Pearson", row=1, col=1)
fig.update_yaxes(title_text="Spearman", row=1, col=2)
# Typo fixed in the displayed title: "exponatial" -> "exponential".
fig.update_layout(title_text="<span style='font-size: 18pt;'>Comparing correlations after exponential transformation</span>")
fig.show()